@inproceedings{mirarouter,
  author    = {Park, Dongkook and Eachempati, Soumya and Das, Reetuparna and Mishra, Asit K. and Xie, Yuan and Vijaykrishnan, N. and Das, Chita R.},
  title     = {{MIRA}: A Multi-layered On-Chip Interconnect Router Architecture},
  booktitle = {ISCA '08: Proceedings of the 35th International Symposium on Computer Architecture},
  pages     = {251--261},
  year      = {2008},
}

@inproceedings{page-color-lei,
  author    = {Jin, Lei and Cho, Sangyeun},
  title     = {{SOS}: A Software-Oriented Distributed Shared Cache Management Approach for Chip Multiprocessors},
  booktitle = {PACT '09: Proceedings of the 2009 18th International Conference on Parallel Architectures and Compilation Techniques},
  pages     = {361--371},
  year      = {2009},
}

@inproceedings{Awasthi09dynamichardware-assisted,
  author    = {Awasthi, Manu and Sudan, Kshitij and Balasubramonian, Rajeev and Carter, John},
  title     = {Dynamic Hardware-Assisted Software-Controlled Page Placement to Manage Capacity Allocation and Sharing within Large Caches},
  booktitle = {{IEEE} International Symposium on High Performance Computer Architecture},
  pages     = {250--261},
  year      = {2009},
}


@inproceedings{3d-mem-loh,
  author    = {Loh, Gabriel H.},
  title     = {{3D-Stacked} Memory Architectures for Multi-core Processors},
  booktitle = {ISCA '08: Proceedings of the 35th International Symposium on Computer Architecture},
  pages     = {453--464},
  year      = {2008},
}

@inproceedings{black3d,
  author    = {Black, Bryan and Annavaram, Murali and Brekelbaum, Ned and DeVale, John and Jiang, Lei and Loh, Gabriel H. and McCauley, Don and Morrow, Pat and Nelson, Donald W. and Pantuso, Daniel and Reed, Paul and Rupley, Jeff and Shankar, Sadasivan and Shen, John and Webb, Clair},
  title     = {Die Stacking ({3D}) Microarchitecture},
  booktitle = {MICRO 39: Proceedings of the 39th Annual IEEE/ACM International Symposium on Microarchitecture},
  pages     = {469--479},
  year      = {2006},
}

@article{picoserver,
  author  = {Kgil, Taeho and D'Souza, Shaun and Saidi, Ali and Binkert, Nathan and Dreslinski, Ronald and Mudge, Trevor and Reinhardt, Steven and Flautner, Krisztian},
  title   = {{PicoServer}: using {3D} stacking technology to enable a compact energy efficient chip multiprocessor},
  journal = {SIGPLAN Not.},
  volume  = {41},
  number  = {11},
  pages   = {117--128},
  year    = {2006},
}

@article{mysore,
  author  = {Mysore, Shashidhar and Agrawal, Banit and Srivastava, Navin and Lin, Sheng-Chih and Banerjee, Kaustav and Sherwood, Tim},
  title   = {Introspective {3D} chips},
  journal = {SIGOPS Oper. Syst. Rev.},
  volume  = {40},
  number  = {5},
  pages   = {264--273},
  year    = {2006},
}

@article{loh-design,
  author  = {Loh, Gabriel H. and Xie, Yuan and Black, Bryan},
  title   = {Processor Design in {3D} Die-Stacking Technologies},
  journal = {IEEE Micro},
  volume  = {27},
  number  = {3},
  pages   = {31--48},
  year    = {2007},
}




@article{bridge-mem-3d,
  author  = {Liu, Christianto C. and Ganusov, Ilya and Burtscher, Martin and Tiwari, Sandip},
  title   = {Bridging the Processor-Memory Performance Gap with {3D} {IC} Technology},
  journal = {IEEE Des. Test},
  volume  = {22},
  number  = {6},
  pages   = {556--564},
  year    = {2005},
}

@inproceedings{thermally-3d-mem,
  author    = {Loi, Gian Luca and Agrawal, Banit and Srivastava, Navin and Lin, Sheng-Chih and Sherwood, Timothy and Banerjee, Kaustav},
  title     = {A thermally-aware performance analysis of vertically integrated ({3-D}) processor-memory hierarchy},
  booktitle = {DAC '06: Proceedings of the 43rd annual Design Automation Conference},
  pages     = {991--996},
  year      = {2006},
}

3D
-------------------
General structure

@inproceedings{cacti6-micro-2007,
  author    = {Muralimanohar, N. and Balasubramonian, R. and Jouppi, N.},
  title     = {Optimizing {NUCA} Organizations and Wiring Alternatives for Large Caches with {CACTI} 6.0},
  booktitle = {Proceedings of the 40th International Symposium on Microarchitecture},
  pages     = {3--14},
  year      = {2007},
}


@article{simics,
  author  = {Magnusson, Peter S. and Christensson, Magnus and Eskilson, Jesper and Forsgren, Daniel and H{\aa}llberg, Gustav and H{\"o}gberg, Johan and Larsson, Fredrik and Moestedt, Andreas and Werner, Bengt},
  title   = {Simics: A Full System Simulation Platform},
  journal = {Computer},
  volume  = {35},
  number  = {2},
  pages   = {50--58},
  year    = {2002},
}

% NOTE(review): same work as the entry keyed alameldeen-comload-2003 later in
% this file; both keys are kept so existing citations continue to resolve.
@article{simulate-commercial,
  author  = {Alameldeen, Alaa R. and Martin, Milo M. K. and Mauer, Carl J. and Moore, Kevin E. and Xu, Min and Hill, Mark D. and Wood, David A. and Sorin, Daniel J.},
  title   = {Simulating a \${2M} Commercial Server on a \${2K} {PC}},
  journal = {Computer},
  volume  = {36},
  number  = {2},
  pages   = {50--57},
  year    = {2003},
}

@article{gems,
  author  = {Martin, Milo M. K. and Sorin, Daniel J. and Beckmann, Bradford M. and Marty, Michael R. and Xu, Min and Alameldeen, Alaa R. and Moore, Kevin E. and Hill, Mark D. and Wood, David A.},
  title   = {Multifacet's general execution-driven multiprocessor simulator ({GEMS}) toolset},
  journal = {SIGARCH Comput. Archit. News},
  volume  = {33},
  number  = {4},
  pages   = {92--99},
  year    = {2005},
}

@inproceedings{graphmine-2006,
  author    = {Buehrer, G. and Parthasarathy, S. and Chen, Y.},
  title     = {Adaptive Parallel Graph Mining for {CMP} Architectures},
  booktitle = {Proceedings of the Sixth International Conference on Data Mining},
  pages     = {97--106},
  year      = {2006},
}


% NOTE(review): the original listed the journal "Internet Mathematics" as the
% booktitle of an inproceedings entry. Year 2002 and pages 636--646 appear to
% belong to the Allerton conference version of this survey -- confirm which
% version should be cited.
@inproceedings{Broder02networkapplications,
  author    = {Broder, Andrei and Mitzenmacher, Michael},
  title     = {Network Applications of {Bloom} Filters: A Survey},
  booktitle = {Proceedings of the 40th Annual Allerton Conference on Communication, Control, and Computing},
  pages     = {636--646},
  year      = {2002},
}



@phdthesis{bienia11benchmarking,
  author = {Bienia, Christian},
  title  = {Benchmarking Modern Multiprocessors},
  school = {Princeton University},
  month  = jan,
  year   = {2011},
}
PARSEC

-------------------

% NOTE(review): same work as the entry keyed simulate-commercial earlier in
% this file; both keys are kept so existing citations continue to resolve.
@article{alameldeen-comload-2003,
  author  = {Alameldeen, Alaa R. and Martin, Milo M. K. and Mauer, Carl J. and Moore, Kevin E. and Xu, Min and Hill, Mark D. and Wood, David A. and Sorin, Daniel J.},
  title   = {Simulating a \${2M} Commercial Server on a \${2K} {PC}},
  journal = {IEEE Computer},
  volume  = {36},
  number  = {2},
  pages   = {50--57},
  month   = feb,
  year    = {2003},
}

@article{martin-prediction-2003,
  author  = {Martin, Milo M. K. and Harper, Pacia J. and Sorin, Daniel J. and Hill, Mark D. and Wood, David A.},
  title   = {Using destination-set prediction to improve the latency/bandwidth tradeoff in shared-memory multiprocessors},
  journal = {SIGARCH Comput. Archit. News},
  year    = {2003},
  volume  = {31},
  number  = {2},
  pages   = {206--217},
}

@inproceedings{hossain-2008,
  author    = {Hossain, Hemayet and Dwarkadas, Sandhya and Huang, Michael C.},
  title     = {Improving support for locality and fine-grain sharing in chip multiprocessors},
  booktitle = {PACT '08: Proceedings of the 17th international conference on Parallel architectures and compilation techniques},
  year      = {2008},
  pages     = {155--165},
}

@inproceedings{hardavellas-2009,
  author    = {Hardavellas, Nikos and Ferdman, Michael and Falsafi, Babak and Ailamaki, Anastasia},
  title     = {Reactive {NUCA}: near-optimal block placement and replication in distributed caches},
  booktitle = {Proceedings of the 36th Annual International Symposium on Computer Architecture},
  pages     = {184--195},
  month     = jun,
  year      = {2009},
}

@inproceedings{kandemir-2008,
  author    = {Kandemir, Mahmut and Li, Feihui and Irwin, Mary Jane and Son, Seung Woo},
  title     = {A novel migration-based {NUCA} design for chip multiprocessors},
  booktitle = {SC '08: Proceedings of the 2008 ACM/IEEE conference on Supercomputing},
  pages     = {1--12},
  year      = {2008},
}

@inproceedings{abdelshafi-1997,
  author    = {Abdel-Shafi, Hazim and Hall, Jonathan and Adve, Sarita V. and Adve, Vikram S.},
  title     = {An Evaluation of Fine-Grain Producer-Initiated Communication in Cache-Coherent Multiprocessors},
  booktitle = {HPCA '97: Proceedings of the 3rd IEEE Symposium on High-Performance Computer Architecture},
  pages     = {204},
  year      = {1997},
}

@inproceedings{cheng-prodcons-2007,
  author    = {Cheng, Liqun and Carter, John B. and Dai, Donglai},
  title     = {An Adaptive Cache Coherence Protocol Optimized for Producer-Consumer Sharing},
  booktitle = {HPCA '07: Proceedings of the 2007 IEEE 13th International Symposium on High Performance Computer Architecture},
  pages     = {328--339},
  year      = {2007},
}

Sharing
-----------------------------
@inproceedings{tagless_dir,
  author    = {Zebchuk, Jason and Srinivasan, Vijayalakshmi and Qureshi, Moinuddin K. and Moshovos, Andreas},
  title     = {A tagless coherence directory},
  booktitle = {MICRO 42: Proceedings of the 42nd Annual IEEE/ACM International Symposium on Microarchitecture},
  pages     = {423--434},
  year      = {2009},
}

tagless





@inproceedings{limited-pointer-dir,
  author    = {Agarwal, A. and Simoni, R. and Hennessy, J. and Horowitz, M.},
  title     = {An evaluation of directory schemes for cache coherence},
  booktitle = {ISCA '88: Proceedings of the 15th Annual International Symposium on Computer architecture},
  pages     = {280--298},
  year      = {1988},
}

limited pointer - superset representing a set of caches
pointer, if not enough, broadcast, no collate

@inproceedings{Gupta90reducingmemory,
  author    = {Gupta, Anoop and Weber, Wolf-Dietrich and Mowry, Todd},
  title     = {Reducing Memory and Traffic Requirements for Scalable Directory-Based Cache Coherence Schemes},
  booktitle = {International Conference on Parallel Processing},
  pages     = {312--321},
  year      = {1990},
}

Coarse vector: shared by small procs. limited vector -> overflow, coarse bit vector
Sparse dir: one directory for several blocks

@inproceedings{Choi99segmentdirectory,
  author    = {Choi, Jong Hyuk and Park, Kyu Ho},
  title     = {Segment Directory Enhancing the Limited Directory Cache Coherence Schemes},
  booktitle = {Proc. 13th International Parallel and Distributed Processing Symp.},
  pages     = {258--267},
  year      = {1999},
}

A segment directory element consists of a segment
vector which is a segment of a full map vector, and a segment
pointer which determines the position of the segment
within the full map vector


@article{sgi_origin,
  author  = {Laudon, James and Lenoski, Daniel},
  title   = {The {SGI} {Origin}: a {ccNUMA} highly scalable server},
  journal = {SIGARCH Comput. Archit. News},
  volume  = {25},
  number  = {2},
  pages   = {241--251},
  year    = {1997},
}

SGI origin - depending on the position switch between coarse vector and full vector

@inproceedings{tag_dir,
  author    = {O'Krafka, Brian W. and Newton, A. Richard},
  title     = {An empirical evaluation of two memory-efficient directory methods},
  booktitle = {ISCA '90: Proceedings of the 17th annual international symposium on Computer Architecture},
  pages     = {138--147},
  year      = {1990},
}

use tag to refine in-cache dir

@article{Censier_dir,
  author  = {Censier, Lucien M. and Feautrier, Paul},
  title   = {A New Solution to Coherence Problems in Multicache Systems},
  journal = {IEEE Transactions on Computers},
  volume  = {27},
  number  = {12},
  pages   = {1112--1118},
  month   = dec,
  year    = {1978},
}

in-cache dir

@phdthesis{simoni_thesis,
  author  = {Simoni, Jr., Richard Thomas},
  title   = {Cache coherence directories for scalable multiprocessors},
  school  = {Stanford University},
  address = {Stanford, CA, USA},
  year    = {1992},
}

@article{acacio_level,
  author  = {Acacio, Manuel E. and Gonz{\'a}lez, Jos{\'e} and Garc{\'\i}a, Jos{\'e} M. and Duato, Jos{\'e}},
  title   = {A Two-Level Directory Architecture for Highly Scalable {cc-NUMA} Multiprocessors},
  journal = {IEEE Trans. Parallel Distrib. Syst.},
  volume  = {16},
  number  = {1},
  pages   = {67--79},
  year    = {2005},
}

@inproceedings{ZillesTM,
  author    = {Zilles, Craig and Rajwar, Ravi},
  title     = {Brief Announcement: Transactional Memory and the Birthday Paradox},
  booktitle = {Proceedings of the 19th {ACM} Symposium on Parallelism in Algorithms and Architectures},
  pages     = {303--304},
  year      = {2007},
}

@inproceedings{waypoint,
  author    = {Kelm, John H. and Johnson, Matthew R. and Lumetta, Steven S. and Patel, Sanjay J.},
  title     = {{WAYPOINT}: scaling coherence to thousand-core architectures},
  booktitle = {Proceedings of the 19th international conference on Parallel architectures and compilation techniques},
  pages     = {99--110},
  year      = {2010},
}

@inproceedings{cuckoo-dir,
  author    = {Ferdman, Michael and Lotfi-Kamran, Pejman and Balet, Ken and Falsafi, Babak},
  title     = {Cuckoo Directory: Efficient and Scalable {CMP} Coherence},
  year      = {2011},
  booktitle = {HPCA '11: Proceedings of the 2011 IEEE 17th International Symposium on High Performance Computer Architecture},
}

directory
------------------------------------
snoopy filter

@inproceedings{jetty_filtering,
  author    = {Moshovos, Andreas and Memik, Gokhan and Falsafi, Babak and Choudhary, Alok},
  title     = {{JETTY}: Filtering Snoops for Reduced Energy Consumption in {SMP} Servers},
  booktitle = {International Symposium on High-Performance Computer Architecture},
  pages     = {85--96},
  year      = {2001},
}

destination filtering

@inproceedings{regionscout,
  author    = {Moshovos, Andreas},
  title     = {{RegionScout}: Exploiting Coarse Grain Sharing in Snoop-Based Coherence},
  booktitle = {ISCA '05: Proceedings of the 32nd annual international symposium on Computer Architecture},
  pages     = {234--245},
  year      = {2005},
}

source filtering, data region

@inproceedings{bluegene_filter,
  author    = {Salapura, Valentina and Blumrich, Matthias and Gara, Alan},
  title     = {Design and implementation of the {Blue Gene/P} snoop filter},
  booktitle = {Proceedings of the International Symposium on High-Performance Computer Architecture},
  year      = {2008},
}

blueGene, destination filtering

@inproceedings{coarse_tracking,
  author    = {Cantin, Jason F. and Lipasti, Mikko H. and Smith, James E.},
  title     = {Improving Multiprocessor Performance with Coarse-Grain Coherence Tracking},
  booktitle = {ISCA '05: Proceedings of the 32nd annual international symposium on Computer Architecture},
  pages     = {246--257},
  year      = {2005},
}

source filtering

@inproceedings{in_network_coherence_filter,
  author    = {Agarwal, Niket and Peh, Li-Shiuan and Jha, Niraj K.},
  title     = {In-network coherence filtering: snoopy coherence without broadcasts},
  booktitle = {MICRO 42: Proceedings of the 42nd Annual IEEE/ACM International Symposium on Microarchitecture},
  pages     = {232--243},
  year      = {2009},
}

in router filter, snoop on unordered

@inproceedings{bus_raj,
  author    = {Udipi, Aniruddha and Muralimanohar, Naveen and Balasubramonian, Rajeev},
  title     = {Towards Scalable, Energy-Efficient, Bus-Based On-Chip Networks},
  year      = {2010},
  booktitle = {Proceedings of the International Symposium on High-Performance Computer Architecture},
}

hierarchical bus, filter at inter node



------------------------------------
machines

@misc{niagara2,
  author       = {{Sun Microsystems, Inc.}},
  title        = {{OpenSPARC} {T2} system-on-chip ({SoC}) microarchitecture specification},
  howpublished = {http://www.opensparc.net/opensparc-t2/index.html},
  month        = may,
  year         = {2008},
}

@misc{intel_core_duo,
  author       = {{Intel Corporation}},
  title        = {{Intel} {Core} {Duo} {Processor} and {Intel} {Core} {Solo} {Processor} on 65 nm {Process}},
  howpublished = {http://download.intel.com/design/mobile/datashts/30922106.pdf},
  month        = jan,
  year         = {2007},
}

@inproceedings{amd_hammer,
  author    = {Ahmed, A. and Conway, P. and Hughes, B. and Weber, F.},
  title     = {{AMD} Opteron Shared Memory MP Systems},
  year      = {2002},
  booktitle = {Proceedings of the 14th HotChips Symposium},
}

@misc{itanium9300,
  author       = {{Intel Corporation}},
  title        = {{Intel} {Itanium} {Processor} 9300 {Series} {Datasheet}},
  howpublished = {http://download.intel.com/design/itanium/downloads/322821.pdf},
  month        = feb,
  year         = {2010},
}

